# Load the saved workspace files this script depends on. The code below reads
# two objects it never creates itself, `order_of_activities` and `sum_stats`,
# so they presumably come from these two files (TODO confirm which file
# provides which).
# NOTE(review): ggplot(), melt() and grid.arrange() are called further down,
# but no package providing them is attached anywhere in this file -- presumably
# the calling session has them loaded; confirm before running this standalone.
load("02_09_m5s_join_behav_order_activities.RData")
load("02_09_m5s_join_univ_sum_stats.RData")

# table(order_of_activities$action1)
# blog facebook    forum   meetup 
# 399606  1874947    43280    30675 

# Display order of the platforms on every chart below.
order_levels <- c("blog", "meetup", "facebook", "forum")

# One frequency table per activity position (columns action1 .. action4 of
# order_of_activities). Each table gets Var1 re-levelled to `order_levels`
# (values outside those levels become NA) and a constant `Tot` column holding
# the sum of its counts.
act_freq_tbl <- lapply(1:4, function(step) {
  counts <- as.data.frame(table(order_of_activities[[paste0("action", step)]]))
  counts$Var1 <- factor(counts$Var1, levels = order_levels)
  counts$Tot <- sum(counts$Freq)
  counts
})

# Total count behind each of the four tables, as a plain numeric vector.
n_act <- vapply(act_freq_tbl, function(counts) sum(counts$Freq), numeric(1))

#  Users by first landing and successive steps
#
# do.plot(i) builds the bar chart for the i-th activity step from the global
# `act_freq_tbl[[i]]`:
#   - one bar per Var1 value, bar height = Freq / Tot;
#   - each bar annotated with that share times 100, rounded to two decimals;
#   - y axis formatted as percentages and fixed to the 0-80% range;
#   - x-axis label "<ordinal> activity (n=<total>)".
# `i` must be in 1..4 (it also selects the ordinal label).
# Returns the ggplot object; nothing is drawn here.
do.plot <- function(i) {
  lbs <- c("1st activity", "2nd activity", "3rd activity", "4th activity")
  step_tbl <- act_freq_tbl[[i]]
  # Tot carries the same total on every row, so read it from the first row.
  # Indexing it with the step number `i` returned NA (label "n=NA") whenever
  # the table had fewer than `i` rows.
  n_total <- step_tbl$Tot[1]
  ggplot(step_tbl, aes(y = Freq / Tot, x = Var1)) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = round(Freq / Tot * 100, digits = 2)),
              position = position_dodge(width = 0.9), vjust = -0.25, size = 3) +
    scale_y_continuous(labels = scales::percent, limits = c(0, 0.8)) +
    labs(x = paste0(lbs[i],
                    " (n=",
                    formatC(n_total, format = "d", big.mark = ','),
                    ")"), y = NULL)
}
# One chart per activity step, in step order.
p <- lapply(1:4, do.plot)


## Moving from
# For every distinct first activity, count the rows that start there (act1)
# and how many of those rows also have a non-missing 2nd, 3rd and 4th activity
# (act2..act4); `sum` is the total of the four counts.
action_type <-  unique(order_of_activities$action1)

# Zero-row template carrying the full column set, including `sum`. rbind()
# skips zero-row arguments when real rows are present and returns the template
# itself when there are none, so the result always has the same columns.
users_moving_from <- data.frame(platform = character(), act1 = numeric(),
                                act2 = numeric(), act3 = numeric(),
                                act4 = numeric(), sum = numeric())

# Percentage of users moving to different activities
# Rows are built in one pass and bound once instead of growing the data frame
# with rbind() on every iteration. as.vector() mirrors what a for loop does
# with a factor (iterates over its character values).
moving_rows <- lapply(as.vector(action_type), function(act) {
  print(act)
  # Same rows subset(order_of_activities, action1 == act) would keep:
  # comparisons that evaluate to NA are dropped.
  started_here <- order_of_activities[which(order_of_activities$action1 == act), ,
                                      drop = FALSE]
  reached <- c(nrow(started_here),
               sum(!is.na(started_here$action2)),
               sum(!is.na(started_here$action3)),
               sum(!is.na(started_here$action4)))
  data.frame(platform = act,
             act1 = reached[1],
             act2 = reached[2],
             act3 = reached[3],
             act4 = reached[4],
             sum = sum(reached))
})
users_moving_from <- do.call(rbind, c(list(users_moving_from), moving_rows))

# Each count as a share of the row's `sum`.
# NOTE(review): the column names read like step-to-step ratios (act2/act1,
# act3/act2, ...) but all four are divided by `sum` -- confirm this is the
# intended denominator. Names are kept because the plot below selects them.
users_moving_from$act1_act1 <- users_moving_from$act1 / users_moving_from$sum
users_moving_from$act2_act1 <- users_moving_from$act2 / users_moving_from$sum
users_moving_from$act3_act2 <- users_moving_from$act3 / users_moving_from$sum
users_moving_from$act4_act3 <- users_moving_from$act4 / users_moving_from$sum
users_moving_from$platform <- factor(users_moving_from$platform, levels = order_levels)

# Grouped bar chart of the four share columns of users_moving_from, one group
# of bars per platform. The columns are selected by name and the id variable
# is stated explicitly, so the plot no longer depends on column positions
# (previously 1 and 7:10) or on melt() guessing which column is the id.
# The legend labels follow the order of the share columns listed here.
p2 <- ggplot(melt(users_moving_from[, c("platform",
                                        "act1_act1",
                                        "act2_act1",
                                        "act3_act2",
                                        "act4_act3")],
                  id.vars = "platform"),
             aes(x = platform, y = value)) +
  geom_bar(aes(fill = variable), position = "dodge", stat = "identity") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_discrete(guide = guide_legend(title = NULL), labels = c("as 1st activity",
                                                                     "as 2nd activity",
                                                                     "as 3rd activity",
                                                                     "as 4th activity"))



# Split order_of_activities by first activity: a named list with one data
# frame per value in `action_type`, each holding the rows whose action1 equals
# that value (rows where the comparison is NA are left out).
subsets <- lapply(
  setNames(nm = as.vector(action_type)),
  function(act) {
    order_of_activities[which(order_of_activities$action1 == act), , drop = FALSE]
  }
)


# Provenance table: one row per first activity in `action_type`. For each
# activity step (1..4) and each platform it counts the rows of that subset
# whose action<step> equals the platform. Columns are named <platform><step>
# and ordered step by step, platforms as facebook, blog, meetup, forum.
provenance_rows <- list()
for (act in action_type) {
  print(act)
  from_act <- subsets[[act]]
  print(table(from_act$action2))

  step_counts <- list()
  for (step in 1:4) {
    step_values <- from_act[[paste0("action", step)]]
    for (target in c("facebook", "blog", "meetup", "forum")) {
      # NA comparisons are not counted, matching what subset() keeps.
      step_counts[[paste0(target, step)]] <- sum(step_values == target, na.rm = TRUE)
    }
  }

  provenance_rows[[length(provenance_rows) + 1]] <-
    do.call(data.frame, c(list(platform = act), step_counts))
}
# The leading empty data frame keeps the result a data frame even when there
# are no rows to bind.
provenance_df <- do.call(rbind, c(list(data.frame()), provenance_rows))
# Per-platform totals read from sum_stats by their unique_users_* keys, in the
# order facebook, blog, meetup, forum -- the same platform order as the column
# groups of provenance_df.
univ_users <- c(sum_stats[["unique_users_fb"]], sum_stats[["unique_users_blog"]],
                sum_stats[["unique_users_meetup"]], sum_stats[["unique_users_forum"]])

# This is used to build the chart
# The three tables below are only auto-printed; nothing is stored.
#
# 1) First-activity counts (columns 2:5 = facebook1, blog1, meetup1, forum1)
#    as a percentage of each platform's total. The division is done per COLUMN
#    so every count is paired with its own platform's total. A plain
#    `df / univ_users` divides per ROW and is only right when the rows happen
#    to come out in facebook, blog, meetup, forum order.
(sweep(provenance_df[,2:5], 2, univ_users, "/")) * 100
# 2) Counts for steps 2-4 added together, transposed so that rows are the
#    platforms in univ_users order, as a percentage of each platform's total.
(t(provenance_df[,6:9]+provenance_df[,10:13]+provenance_df[,14:17]) / univ_users) * 100
# 3) The same step 2-4 counts as a percentage of the rows that started on the
#    row's platform. Each row of columns 2:5 has a single non-zero entry (the
#    subset is already restricted to one first activity), so its row sum is
#    that starting count. This replaces diag(), which picked up zeros -- and
#    produced Inf/NaN -- whenever the row order differed from the column order.
((provenance_df[,6:9]+provenance_df[,10:13]+provenance_df[,14:17]) / rowSums(provenance_df[,2:5])) * 100

# Number of users also active on other platforms
# Copy of order_of_activities with a `string` column (the four activity
# columns pasted together, space-separated) followed by one logical column per
# platform that is TRUE when the platform name occurs anywhere in `string`.
conc_activities <- transform(
  order_of_activities,
  string = paste(action1, action2, action3, action4, sep = ' ')
)
# Flag columns are appended in this order: facebook, meetup, forum, blog.
for (platform_name in c("facebook", "meetup", "forum", "blog")) {
  conc_activities[[platform_name]] <- grepl(platform_name, conc_activities$string)
}

# For every platform in `action_type`: keep the rows flagged for that platform
# and count, per column 7:10 of conc_activities, how many of those rows are
# flagged there. Each result is a small data frame with row names and
# `platform` taken from those column names, the counts in `users`, and `tot`
# set to the count found in the platform's own row.
behav_intersect <- list()
for (act in action_type) {
  flagged_rows <- conc_activities[[act]] == TRUE
  co_counts <- colSums(conc_activities[flagged_rows, 7:10])
  overlap <- data.frame(
    users = as.vector(co_counts),
    platform = factor(names(co_counts), levels = order_levels),
    row.names = names(co_counts)
  )
  overlap$tot <- overlap[act, "users"]
  behav_intersect[[act]] <- overlap
}

# Gini() comes from ineq. library() stops right here if the package is
# missing; require() would only warn and let Gini() fail inside the loop.
library(ineq)

# One bar chart per platform, in `order_levels` order: the share users / tot
# for every platform row of behav_intersect[[act]], titled with the platform
# name, the count in the platform's own row (n) and the Gini coefficient of
# the `users` column rounded to two decimals.
gini_plots <- list()
for (act in order_levels) {
  overlap <- behav_intersect[[act]]
  gini_plots[[act]] <- ggplot(overlap, aes(x = platform, y = users / tot)) +
    geom_bar(stat = "identity") +
    scale_y_continuous(labels = scales::percent) +
    labs(title = paste0(act,
                        "\n(n=",
                        formatC(overlap[act, "users"], format = "d", big.mark = ','),
                        ", Gini=",
                        round(Gini(overlap$users), 2),
                        ")"
    ), y = NULL, x = NULL)
}

# Figure 5.2
# The four per-platform charts side by side. Names are stripped so the plots
# are passed positionally.
do.call(grid.arrange, c(unname(gini_plots), list(nrow = 1)))

# Figure 5.3
# The four per-step charts side by side.
do.call(grid.arrange, c(p, list(nrow = 1)))

# Figure 5.4
p2